---
title: "Highways Earmarks Paper"
author: "Peter McLaughlin"
output: pdf_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
library(readxl)
library(stargazer)
library(reshape2)
library(MASS)
library(pscl)
library(broom)
library(DAMisc)
library(strucchange)
library(gap)
library(jtools)
library(interactions)
library(Hmisc)
library(tidyverse)
library(psych)
library(car)
library(plotly)
library(ggplot2) # pretty plots
library(RColorBrewer) # pretty colors
library(ggExtra) # axes and labeling
library(showtext) # use Google Fonts
library(kableExtra) # table construction
library(robustbase)
library(cowplot)
library(clubSandwich)
library(plm)
library(table1)


da <- read_excel("Spending_Data.xlsx")


state_data <- read_excel("State_Data.xlsx")

state_data$STATE = toupper(state_data$STATE)

da <- left_join(da, state_data, by = c('STATE'))

# Divide diff_in_amount and htf_diff variables by 1 million to make visuals/tables more interpretable 
da$diff_in_amount <- da$diff_in_amount / 1000000
da$htf_diff <- da$htf_diff / 1000000

da$nine_actual <- da$nine_actual / 1000000000

da$pop <- da$pop / 100000
da$pop_e <- da$pop_e / 100000
da$vmt <- da$vmt / 10000
da$vmt_e <- da$vmt_e / 10000
da$fah_miles <- da$fah_miles /10000
da$fah_miles_e <- da$fah_miles_e /10000
da$pop_diff <- da$pop_diff / 100000
da$pop_diff_e <- da$pop_diff_e / 100000
da$vmt_diff <-  da$vmt_diff / 10000
da$vmt_diff_e <-  da$vmt_diff_e / 10000
da$fah_diff <- da$fah_diff /10000 
da$fah_diff_e <- da$fah_diff_e /10000
```


```{r Ratio Setup and Models, include=FALSE}

#Specify data set to be used and filter to be 2010 & 2020
ds <- da %>%
  filter(da$year %in% c(2010, 2020)) %>%
  dplyr::select(STATE, year, pop, percent_ear, ratio, Region, square_miles, pop_diff, vmt_diff, vmt, fah_diff, fah_miles, nine_actual, ear_per_cap)%>%
  na.omit()

# transform data long to wide for ratio DV
ds.ratio <- spread(ds, year, ratio)

ds.ratio <- fastDummies::dummy_cols(ds.ratio, select_columns = "Region")
ds.ratio$RegionNortheast <- ds.ratio$Region_Northeast
ds.ratio$RegionSouth <- ds.ratio$Region_South
ds.ratio$RegionWest <- ds.ratio$Region_West

# Rename 2010 to ten, and 2020 to twenty
names(ds.ratio)[names(ds.ratio)=="2010"] <- "ten"
names(ds.ratio)[names(ds.ratio)=="2020"] <- "twenty"

# create measure for ratio change from 2010 to 2020
ds.ratio$change <- (ds.ratio$twenty - ds.ratio$ten)

# Density plot for change from 2010 to 2020
ggplot(ds.ratio, aes(x=change)) +
   geom_density(fill='darkgrey', alpha=0.5) +
 xlab("\nRatio Change") +
  ggtitle("Formula Ratio")+
  ylab("Density\n")+
  theme_minimal()+
    theme(legend.title=element_blank())+
     theme(text = element_text(size=16, family = ("Sabon")))-> formula_ratio_c

# Robust Regression
mod_form_ratio_rob <- rlm(change ~ percent_ear + log(pop) + log(fah_miles) + log(vmt), 
                   data = ds.ratio)


stargazer(mod_form_ratio_rob,  
          type = "text", single.row = TRUE)

```


```{r Ratio Change Sim, include=FALSE}

#create range for IV of interest
ear_range <- seq(min(ds.ratio$percent_ear), max(ds.ratio$percent_ear), by = .1)

# Simulating from the distribution of the betas
rratio.pe <- mod_form_ratio_rob$coefficients # the betas
rratio.vc <- vcov(mod_form_ratio_rob) # var-cov matrix
rratio.se <- sqrt(diag(rratio.vc)) # standard errors
rratio.s2 <- summary(mod_form_ratio_rob)$sigma**2 # the variance or sigma^2

sim_n <- 10000 # 10k simulations
rratio.simbs <- mvrnorm(sim_n, rratio.pe, rratio.vc)
rratio.simbs <- data.frame(rratio.simbs)
colnames(rratio.simbs)[1] <- "intercept" # a nicer name
head(rratio.simbs) # take a look

rratio.x_mean <- c(
  1, # this is the identity placeholder for the intercept
  mean(ds.ratio$percent_ear), # the mean of x_1
  mean(log(ds.ratio$pop)),
  mean(log(ds.ratio$fah_miles)),
  mean(log(ds.ratio$vmt)))

rratio.cf_data <- matrix(rratio.x_mean, nrow = length(ear_range), ncol = 5, byrow = TRUE)
rratio.cf_data[,2] <- ear_range

rratio.sim_betas <- mvrnorm(sim_n, rratio.pe, rratio.vc) # simulate betas
rratio.mu <- rratio.sim_betas%*%t(rratio.cf_data) # systematic component mu; transpose for comformability
rratio.s2 <- summary(mod_form_ratio_rob)$sigma**2 # stochastic component sigma^2

rratio.cf_y <- rratio.mu + sqrt(rratio.s2) # create the simulated $y$ values

rratio.ev <- NULL
rratio.lci <- NULL
rratio.uci <- NULL

rratio.ev <- apply(rratio.cf_y, 2, mean)
rratio.lci <- apply(rratio.cf_y, 2, quantile, probs = .025)
rratio.uci <- apply(rratio.cf_y, 2, quantile, probs = .975)
rratio.sim.result <- data.frame(cbind(ear_range,rratio.ev,rratio.lci,rratio.uci))

ggplot(data = rratio.sim.result, aes(x = ear_range)) + 
  xlab("\nPercent Earmarks") +
  ylab("Ratio Change\n") +
   ggtitle("Formula Ratio")+
  geom_line(aes(y = rratio.ev)) +
  geom_ribbon(aes(ymin = rratio.lci, ymax = rratio.uci), 
    alpha = 0.4)+
      theme_minimal()+
    theme(legend.title=element_blank())+
     theme(text = element_text(size=16, family = ("Sabon"))) -> c_ratio_rob


# Result interpretation

summary(ds.ratio$percent_ear)
# 1st Quartile = 8.9 ; 3rd Quartile = 16.7

rratio.sim.result
# 8.9 = .04 ; 16.7 = 0.07

# Mean 2020 formula prescribed value
formula_2020 <- da %>%
  filter(da$year %in% c(2020)) %>%
  dplyr::select(Formula)%>%
  na.omit()

describe(formula_2020$Formula)
# 850388124

# Diference between 0.07 ratio increase and 0.04 ratio increase
(850388124 + (850388124*0.07)) - (850388124 + (850388124*0.04))
# 25511644
```

```{r Difference Setup and Models, include=FALSE}

#Specify data set to be used and filter to be 2010 & 2020
ds.diff <- da %>%
  filter(da$year %in% c(2010, 2020)) %>%
  dplyr::select(STATE, year, pop, percent_ear, pop, diff_in_amount, Region, square_miles, pop_diff, vmt, vmt_diff, fah_diff, fah_miles, nine_actual, ear_per_cap)%>%
  na.omit()

# transform data long to wide for ratio DV
ds.diff <- spread(ds.diff, year, diff_in_amount)

ds.diff <- fastDummies::dummy_cols(ds.diff, select_columns = "Region")
ds.diff$RegionNortheast <- ds.diff$Region_Northeast
ds.diff$RegionSouth <- ds.diff$Region_South
ds.diff$RegionWest <- ds.diff$Region_West

# Rename 2010 to ten, and 2020 to twenty
names(ds.diff)[names(ds.diff)=="2010"] <- "ten"
names(ds.diff)[names(ds.diff)=="2020"] <- "twenty"

# create measure for ratio change
ds.diff$change <- (ds.diff$twenty - ds.diff$ten)

# Density plot
ggplot(ds.diff, aes(x=change)) +
   geom_density(fill='darkgrey', alpha=0.5) +
 xlab("\nDollar Change (Millions)") +
  ggtitle("Formula Difference")+
  ylab("\nDensity\n")+
  theme_minimal()+
    theme(legend.title=element_blank())+
     theme(text = element_text(size=16, family = ("Sabon"))) -> formula_diff_c

# Robust Regression
mod_form_diff_rob <- rlm(change ~ percent_ear + log(pop) + log(fah_miles) + log(vmt), 
                   data = ds.diff)

stargazer(mod_form_diff_rob, type = "text", single.row = TRUE)
```



```{r Difference Change Sim, include=FALSE}
#create range for IV of interest
ear_range <- seq(min(ds.diff$percent_ear), max(ds.diff$percent_ear), by = .1)

# Simulating from the distribution of the betas
rd.pe <- mod_form_diff_rob$coefficients # the betas
rd.vc <- vcov(mod_form_diff_rob) # var-cov matrix, remember from Joe's class
rd.se <- sqrt(diag(rd.vc)) # standard errors
rd.s2 <- summary(mod_form_diff_rob)$sigma**2 # the variance or sigma^2

sim_n <- 10000 # 10k simulations
rd.simbs <- mvrnorm(sim_n, rd.pe, rd.vc)
rd.simbs <- data.frame(rd.simbs)
colnames(rd.simbs)[1] <- "intercept" # a nicer name
head(rd.simbs) # take a look

rd.x_mean <- c(
  1, # this is the identity placeholder for the intercept
  mean(ds.diff$percent_ear), # the mean of percent ear
  mean(log(ds.ratio$pop)),
  mean(log(ds.ratio$fah_miles)),
  mean(log(ds.ratio$vmt)))

rd.cf_data <- matrix(rd.x_mean, nrow = length(ear_range), ncol = 5, byrow = TRUE)
rd.cf_data[,2] <- ear_range

rd.sim_betas <- mvrnorm(sim_n, rd.pe, rd.vc) # simulate betas
rd.mu <- rd.sim_betas%*%t(rd.cf_data) # systematic component mu; transpose for comformability
rd.s2 <- summary(mod_form_diff_rob)$sigma**2 # stochastic component sigma^2

rd.cf_y <- rd.mu + sqrt(rd.s2) # create the simulated $y$ values

rd.ev <- NULL
rd.lci <- NULL
rd.uci <- NULL

rd.ev <- apply(rd.cf_y, 2, mean)
rd.lci <- apply(rd.cf_y, 2, quantile, probs = .025)
rd.uci <- apply(rd.cf_y, 2, quantile, probs = .975)
rd.sim.result <- data.frame(cbind(ear_range,rd.ev,rd.lci,rd.uci))

ggplot(data = rd.sim.result, aes(x = ear_range)) + 
  xlab("\nPercent Earmarks") +
  ylab("\nDollar Change (Millions)\n") +
  ggtitle("Formula Difference")+
  geom_line(aes(y = rd.ev)) +
  geom_ribbon(aes(ymin = rd.lci, ymax = rd.uci), 
    alpha = 0.4)+
      theme_minimal()+
    theme(legend.title=element_blank())+
     theme(text = element_text(size=16, family = ("Sabon"))) -> c_diff_rob
```

```{r HTF Ratio Setup and Models, include=FALSE}

#Specify data set to be used and filter to be 2010 & 2020
ds.htf <- da %>%
  filter(da$year %in% c(2010, 2020)) %>%
  dplyr::select(STATE, year, pop, percent_ear, pop, htf_return, Region, square_miles, pop_diff, vmt_diff, vmt, fah_diff, fah_miles, nine_actual, ear_per_cap)%>%
  na.omit()

# transform data long to wide for ratio DV
ds.htf <- spread(ds.htf, year, htf_return)

ds.htf <- fastDummies::dummy_cols(ds.htf, select_columns = "Region")
ds.htf$RegionNortheast <- ds.htf$Region_Northeast
ds.htf$RegionSouth <- ds.htf$Region_South
ds.htf$RegionWest <- ds.htf$Region_West

# Rename 2010 to ten, and 2020 to twenty
names(ds.htf)[names(ds.htf)=="2010"] <- "ten"
names(ds.htf)[names(ds.htf)=="2020"] <- "twenty"

# create measure for ratio change
ds.htf$change <- (ds.htf$twenty - ds.htf$ten)


# Density plot for change from 2010 to 2020
ggplot(ds.htf, aes(x=change)) +
   geom_density(fill='darkgrey', alpha=0.5) +
 xlab("HTF Return Change") +
  ggtitle("HTF Return")+
  ylab("Density")+
  theme_minimal()+
    theme(legend.title=element_blank())+
     theme(text = element_text(size=18, family = ("serif")))

#Log both mallaporttionment measures to reduce DV skew

ds.htf$change_log <- (log(ds.htf$twenty) - log(ds.htf$ten))

ggplot(ds.htf, aes(x=change_log)) +
   geom_density(fill='darkgrey', alpha=0.5) +
 xlab("Logged Ratio Change") +
  ggtitle("HTF Ratio")+
  ylab("Density")+
  theme_minimal()+
    theme(legend.title=element_blank())+
     theme(text = element_text(size=18, family = ("serif"))) -> htf_c

# Robust Regression
mod_htf_ratio_rob <- rlm(change_log ~ percent_ear + log(pop) + log(fah_miles) + log(vmt), 
                   data = ds.htf)

stargazer(mod_htf_ratio_rob,  
          type = "text", single.row = TRUE)
```



```{r HTF Difference Setup and Models, include=FALSE}

#Specify data set to be used and filter to be 2010 & 2020
ds <- da %>%
  filter(da$year %in% c(2010, 2020)) %>%
  dplyr::select(STATE, year, pop, percent_ear, pop, htf_diff, Region, square_miles, pop_diff, vmt_diff, vmt, fah_diff, fah_miles, nine_actual, ear_per_cap)%>%
  na.omit()

# transform data long to wide for htf_diff DV
ds.htf_diff <- spread(ds, year, htf_diff)

ds.htf_diff <- fastDummies::dummy_cols(ds.htf_diff, select_columns = "Region")
ds.htf_diff$RegionNortheast <- ds.htf_diff$Region_Northeast
ds.htf_diff$RegionSouth <- ds.htf_diff$Region_South
ds.htf_diff$RegionWest <- ds.htf_diff$Region_West

# Rename 2010 to ten, and 2020 to twenty
names(ds.htf_diff)[names(ds.htf_diff)=="2010"] <- "ten"
names(ds.htf_diff)[names(ds.htf_diff)=="2020"] <- "twenty"

# create measure for htf_diff change from 2010 to 2020
ds.htf_diff$change <- (ds.htf_diff$twenty - ds.htf_diff$ten)

# Density plot 
ggplot(ds.htf_diff, aes(x=change)) +
   geom_density(fill='darkgrey', alpha=0.5) +
  xlab("Dollar Change (Millions)") +
  ggtitle("HTF Difference")+
  ylab("Density")+
  theme_minimal()+
    theme(legend.title=element_blank())+
     theme(text = element_text(size=18, family = ("serif"))) -> htf_diff_c

# Robust Regression
mod_htf_diff_rob <- rlm(change ~ percent_ear + log(pop) + log(fah_miles) + log(vmt), 
                   data = ds.htf_diff)


stargazer(mod_htf_diff_rob,   
          type = "text", single.row = TRUE)

```


```{r 2005-2009 HTF Ratio Models, include=FALSE}

#Specify data set to be used and filter to be 2005 & 2009
ds.htf.s <- da %>%
  filter(da$year %in% c(2005, 2009)) %>%
  dplyr::select(STATE, year, pop, percent_ear, pop_e, htf_return, Region, square_miles, pop_diff_e, vmt_diff_e, vmt_e, fah_diff_e, fah_miles_e, ear_per_cap)%>%
  na.omit()

# transform data long to wide for ratio DV
ds.htf.s <- spread(ds.htf.s, year, htf_return)

ds.htf.s <- fastDummies::dummy_cols(ds.htf.s, select_columns = "Region")
ds.htf.s$RegionNortheast <- ds.htf.s$Region_Northeast
ds.htf.s$RegionSouth <- ds.htf.s$Region_South
ds.htf.s$RegionWest <- ds.htf.s$Region_West

# Rename 2005 to five, and 2009 to nine
names(ds.htf.s)[names(ds.htf.s)=="2005"] <- "five"
names(ds.htf.s)[names(ds.htf.s)=="2009"] <- "nine"

# create measure for ratio change
ds.htf.s$change <- (ds.htf.s$five - ds.htf.s$nine)

# Density plot for change from 2005 to 2009
ggplot(ds.htf.s, aes(x=change)) +
   geom_density(
    fill="#69b3a2", 
    color="#e9ecef", 
    alpha=0.6) +
 xlab("HTF Return Change") +
  ggtitle("HTF Return")+
  ylab("Density")

#Log both DVs to make them less skewed
ds.htf.s$change_log <- (log(ds.htf.s$nine) - log(ds.htf.s$five))

ggplot(ds.htf.s, aes(x=change_log)) +
  geom_density(fill='darkgrey', alpha=0.5) +
  xlab("Ratio Change") +
  ggtitle("HTF Ratio (Logged)")+
  ylab("Density")+
  theme_minimal()+
    theme(legend.title=element_blank())+
     theme(text = element_text(size=18, family = ("serif"))) -> ratio_change_2009

#  Robust Regression
mod_htf_ratio_rob.s <- rlm(change_log ~ percent_ear + log(pop_e) + log(fah_miles_e) + log(vmt_e), 
                   data = ds.htf.s)

stargazer(mod_htf_ratio_rob.s,  
          type = "text", single.row = TRUE)
```



```{r 2005-2009 HTF Difference Models, include=FALSE}

#Specify data set to be used and filter to be 2005 & 2009
ds <- da %>%
  filter(da$year %in% c(2005, 2009)) %>%
  dplyr::select(STATE, year, pop, percent_ear, pop_e, htf_diff, Region, square_miles, pop_diff_e, vmt_diff_e, vmt_e, fah_diff_e, fah_miles_e, ear_per_cap)%>%
  na.omit()

# transform data long to wide for htf_diff DV
ds.htf_diff.s <- spread(ds, year, htf_diff)

ds.htf_diff.s <- fastDummies::dummy_cols(ds.htf_diff.s, select_columns = "Region")
ds.htf_diff.s$RegionNortheast <- ds.htf_diff.s$Region_Northeast
ds.htf_diff.s$RegionSouth <- ds.htf_diff.s$Region_South
ds.htf_diff.s$RegionWest <- ds.htf_diff.s$Region_West

# Rename 2010 to ten, and 2020 to twenty
names(ds.htf_diff.s)[names(ds.htf_diff.s)=="2005"] <- "five"
names(ds.htf_diff.s)[names(ds.htf_diff.s)=="2009"] <- "nine"

# create measure for htf_diff change from 2005 to 2009
ds.htf_diff.s$change <- (ds.htf_diff.s$nine - ds.htf_diff.s$five)


# Density plot for change from 2005 to 2009
ggplot(ds.htf_diff.s, aes(x=change)) +
  geom_density(fill='darkgrey', alpha=0.5) +
xlab("Dollar Change (Millions)") +
  ggtitle("HTF Difference")+
  ylab("Density")+
  theme_minimal()+
    theme(legend.title=element_blank())+
     theme(text = element_text(size=18, family = ("serif"))) -> diff_change_2009

# Robust Regression
mod_htf_diff_rob.s <- rlm(change ~ percent_ear + log(pop_e) + log(fah_miles_e) + log(vmt_e), 
                   data = ds.htf_diff.s)

stargazer(mod_htf_diff_rob.s, type = "text", single.row = TRUE)
```


```{r Tables, include=TRUE, results='asis'}

# Robust Ratio Only (Table 1)
stargazer(mod_form_ratio_rob, mod_form_diff_rob,   type = "text", single.row=FALSE, digits=3,
  column.labels=c("Formula Ratio", "Formula Difference\\\\ &  & (In Millions)"),
  covariate.labels=c("Percent Earmarks","Population (Logged)", "Federal Aid Highway Miles (Logged)", 
                     "Vehicle Miles Traveled (Logged)", "Constant"),
          title = "Robust Regression: Earmarks and Highway Funding Apportionment Change",
          dep.var.labels.include = FALSE)


#  HTF Change from 2010 to 2020 
stargazer(mod_htf_ratio_rob, mod_htf_diff_rob,   type = "text", single.row=FALSE, digits=3,
  column.labels=c("HTF Ratio", "HTF Difference \\\\ & & (In Millions)"),
  covariate.labels=c("Percent Earmarks","Population (Logged)", "Federal Aid Highway Miles (Logged)", 
                     "Vehicle Miles Traveled (Logged)", "Constant"),
          title = "Robust Regression: Earmarks and Highway Funding Apportionment Change",
          dep.var.labels.include = FALSE)

# HTF Change from 2005 to 2009 
stargazer(mod_htf_ratio_rob.s, mod_htf_diff_rob.s,   type = "text", single.row=FALSE, digits=3,
  column.labels=c("HTF Ratio", "HTF Difference \\\\ &  & (In Millions)"),
  covariate.labels=c("Percent Earmarks","Population (Logged)", "Federal Aid Highway Miles (Logged)", 
                     "Vehicle Miles Traveled (Logged)", "Constant"),
          title  = "Robust Regression: Earmarks and Highway Funding Apportionment Change",
          dep.var.labels.include = FALSE)

```


```{r Figures, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)

###2010-2020 

# Figure 1: DV Density 

plot_grid(formula_ratio_c, formula_diff_c) # Size: 3 x 8.5


# Figure 2: Simulaiton Plots

plot_grid(c_ratio_rob, c_diff_rob) # Size: 4 x 7.5

```


```{r Descriptive Stats, include=FALSE, results='asis'}
knitr::opts_chunk$set(echo = FALSE)

ds.ratio$Region <- car::recode(ds.ratio$Region, "1='Northeast';2='Midwest'; 3='South'; 4='West'")

dvs <- da %>%
  filter(da$year %in% c(2010)) %>%
  dplyr::select(STATE)%>%
  na.omit()

dvs$form_ratio <- ds.ratio$change  
dvs$form_diff <- ds.diff$change
dvs$htf_ratio <- ds.htf$change_log
dvs$htf_diff <- ds.htf_diff$change

# tables

dvs <- subset(dvs, select = -c(STATE))

dvs <- data.frame(dvs)

# A1
stargazer(dvs, type = "text",  covariate.labels = 
            c("Formula Ratio Drift", "Formula Difference Drift", "HTF Ratio Drift", "HTF Difference Drift"))

covars <- da %>%
  filter(da$year %in% c(2010)) %>%
  dplyr::select(STATE, percent_ear, pop, fah_miles,  vmt)%>%
  na.omit()

covars$log_pop <- log(covars$pop)  
covars$log_fah <- log(covars$fah_miles)
covars$log_vmt <- log(covars$vmt)

covars <- subset(covars, select = -c(STATE))

covars <- data.frame(covars)

#A2
stargazer(covars, type = "text",  covariate.labels = 
            c("Percent Earmark", "Population", "Federal Aid Highway Miles", "Vehicle Miles Traveled", "Population (Logged)", "Federal Aid Highway Miles (Logged)", "Vehicle Miles Traveled (Logged)"))

dvs_e <- da %>%
  filter(da$year %in% c(2005)) %>%
  dplyr::select(STATE)%>%
  na.omit()

dvs_e$ds.htf.s <- ds.htf$change_log
dvs_e$htf_diff <- ds.htf_diff.s$change

dvs_e <- subset(dvs_e, select = -c(STATE))

dvs_e <- data.frame(dvs_e)

#A3
stargazer(dvs_e, type = "text",  covariate.labels = 
            c("HTF Ratio Drift", "HTF Difference Drift"))


covars_e <- da %>%
  filter(da$year %in% c(2005)) %>%
  dplyr::select(STATE, percent_ear, pop_e, fah_miles_e,  vmt_e)%>%
  na.omit()

covars_e$log_pop_e <- log(covars_e$pop_e)  
covars_e$log_fah_e <- log(covars_e$fah_miles_e)
covars_e$log_vmt_e <- log(covars_e$vmt_e)

covars_e <- subset(covars_e, select = -c(STATE))

covars_e <- data.frame(covars_e)

#A4
stargazer(covars_e, type = "text",  covariate.labels = 
            c("Percent Earmark", "Population", "Federal Aid Highway Miles", "Vehicle Miles Traveled", "Population (Logged)", "Federal Aid Highway Miles (Logged)", "Vehicle Miles Traveled (Logged)"))

```
